/******************************************************************************
This file pre-cleans data for tests (regents, sat, and psat) for NYC.
We re-scale SAT scores to the new scale according to a cross-
walk provided by the collegeboard. We rescale the regents tests to
account for a change in the test design. Note: year refers to spring year.
--------------------------------------------------------------------------------
Input:			Raw SAT CSVs that we received from NYC
				..\RAW DATA FINAL\NYCDOE\SAT\XX_XX_SAT_Admin_Scrambled.csv"
--------------------------------------------------------------------------------
Intermediate:	Appended Raw SAT data
					> sat_all_appended_raw.dta
--------------------------------------------------------------------------------
Output:			Cleaned SAT files
					> sat_nyc_cleaned.dta
********************************************************************************/

* 	settings
	set more off
	set trace off
	set tracedepth 1
	pause on

*	append switches	(since each append takes a while)
	local append_sat			1

clear all

************************************************************************************************************************
*********************************  SAT  ********************************************************************************
************************************************************************************************************************

* I. Append all years
set rmsg off
if `append_sat' == 1 {
	set obs 1
	gen x = .
	tempfile append_sat
	sa `append_sat'

	// Loop over years
	forval year = 2006/2015 {

		local nextyear = `year' + 1
		local next_digits = substr( "`nextyear'", 3, 2)
		di "`year'"
		use "${cleandata}outcomes\SAT/`year'-`next_digits'_SAT_Admin_Scrambled.dta", clear
		gen sat_file_year = `year'
		destring sat_*, replace

		* Covert old into new scores
		gen sat_math_old = sat_math
		gen sat_wr_cr_old = sat_critical_reading + sat_writing
		gen sat_total_old = sat_wr_cr_old + sat_math_old

		* Merge crosswalks from collegeboard
		destring stu sat_math_old sat_wr_cr_old sat_total_old, replace
		merge m:1 sat_math_old using "${rawdata}Online/SAT PSAT Conversion/sat_conversion_scale_math.dta", keep(1 3) nogen
		merge m:1 sat_wr_cr_old using "${rawdata}Online/SAT PSAT Conversion/sat_conversion_scale_ebrw.dta", keep(1 3) nogen
		merge m:1 sat_total_old using "${rawdata}Online/SAT PSAT Conversion/sat_conversion_scale_total.dta", keep(1 3) nogen
		tostring stu, replace

		rename sat_math_new sat_math_section
		rename sat_total_new sat_total

		keep stu test_month test_year sat_file_year sat_math_section sat_total ///
				sat_ebrw sat_math_old sat_wr_cr_old sat_total_old

		append using `append_sat'
		sa `append_sat', replace

	}

	drop x

	* Append the new test for 15/16
	use "${cleandata}outcomes\SAT\2015-16_SAT_Admin_New_Scrambled.dta", clear
	gen new_test = 1
	gen sat_file_year = 2015

	tostring stu, replace

	destring sat_*, replace

	append using `append_sat'
	sa `append_sat', replace

	drop x

	* Append the new files for 16/17
	use "${cleandata}outcomes\SAT\2016-17_SAT_Admin_New_Scrambled.dta", clear
	gen new_test = 1
	gen sat_file_year = 2016

	tostring stu, replace
	destring sat_*, replace


	append using `append_sat'
	sa `append_sat', replace

	drop x

	sa "${cleandata}sat_all_appended_raw.dta", replace
}

	use "${cleandata}sat_all_appended_raw.dta", clear

	rename student_id_scram stu

// II. Clean Data

	//rename SAT math
	ren sat_math_section sat_math

	//update new test indicator
	replace new_test = 0 if new_test != 1
	gen sat_math_new = sat_math if  new_test == 1
	gen sat_read_new = sat_ebrw if  new_test == 1
	gen sat_total_new = sat_total if  new_test == 1

	//test date
	destring test_month test_year, replace
	gen testdate = mdy(test_month, 1,test_year )

	//most recent score
	bys stu : egen max_testdate = max(testdate)
	foreach score in sat_ebrw sat_math {
		bys stu : egen `score'_last = max((max_testdate== testdate) * `score')
	}

	//highest score
	foreach score in sat_ebrw sat_math {
		bys stu : egen `score'_max = max( `score')
	}

	//keep first test
	bys stu: egen min_testdate = min(testdate)
	drop if testdate != min_testdate

	//still some duplicates remaining, just same record in multiple file
	duplicates drop  stu sat_total sat_ebrw sat_math sat_math_old sat_wr_cr_old sat_total_old, force

	//some students took test twice in same monthxyear, take the max of total
	bys stu : egen max_total = max(sat_total)
	drop if sat_total != max_total

	//few more students have same score in same monthxyear, take max of math then ela
	bys stu : egen max_math = max(sat_math)
	drop if sat_math != max_math
	bys stu : egen max_ebrw = max(sat_ebrw)
	drop if sat_ebrw != max_ebrw

	//assure data is unique on student
	drop if stu == ""
	isid stu

	//terms
	gen term_sat = "f" if test_month > 9 & test_month <= 12 | test_month == 1
	replace term_sat = "s" if test_month > 2 & test_month <= 6

	//get spring years
	replace test_year = test_year + 1 if term_sat == "f"
	ren test_year year_sat

	//get years of most recent and highest scores
	bys stu : egen year_last_sat = max( (max_testdate== testdate) * year_sat)
	gen term_last_sat = "s"
	bys stu : egen year_max_sat = max( (sat_math_max== sat_math) * year_sat)
	gen term_max_sat = "s"


	preserve
		//get 9th grade year (refers to spring year)
		use "${cleandata}/demo_all_years_all_grade.dta", clear
		keep if grade == "09"
		bysort stu: egen year_grade9 = min(year)
		keep if year == year_grade9
		keep stu year_grade9
		tempfile 9thgrade_year
		save `9thgrade_year'

		//get grades so we can standardize
		use "${cleandata}/demo_all_years_all_grade.dta", clear
		keep stu year grade
		rename year year_sat
		tempfile grades
		save `grades'
	restore

	//keep only students with grades
	tostring stu, replace
	merge 1:1 stu year_sat using "`grades'", keep(3) nogen
	merge 1:1 stu using "`9thgrade_year'", keep(1 3) nogen
	destring stu, replace

	//set to missing if before 9th grade
	drop if grade == "AD"
	destring grade, replace
	drop if grade < 9
	drop if year_sat < year_grade9 & year_grade9 != .

	//rename
	ren (test_month max_total new_test testdate min_testdate max_testdate sat_wr_cr_old sat_read_new sat_ebrw*) ///
		(month_sat max_total_sat new_test_sat testdate_sat min_testdate_sat max_testdate_sat sat_ela_old sat_ela_new sat_ela*)
	drop school_day *old* *new* *last* *max* min_testdate_sat new_test_sat sat_file_year year_grade9

	//standardize by test year and test grade
	foreach score in sat_total sat_ela sat_math{
		bys year_sat grade: egen mean = mean(`score')
		bys year_sat grade: egen sd = sd(`score')
		gen `score'_raw = `score'
		replace `score' = (`score' - mean)/sd
		drop mean sd
	}

	sa "${cleandata}sat_nyc_cleaned.dta", replace
